In [1]:
%pylab inline
from astropy.io import fits
from sklearn.ensemble import ExtraTreesRegressor
import pickle


Populating the interactive namespace from numpy and matplotlib
/home/rybizki/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/home/rybizki/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: numpy.dtype size changed, may indicate binary incompatibility. Expected 96, got 88
  return f(*args, **kwds)
/home/rybizki/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d

In [2]:
gdr2val = fits.getdata('../output/GDR2_207/GDR2_207_cleaned_0.0025sampling_validation.fits')

In [3]:
gdr2 = fits.getdata("../output/GDR2_207/GDR2_207_cleaned_0.0025sampling.fits")
# cleaning nan parallax errors
pe = gdr2.parallax_error
clean = ~np.isnan(pe)
print(len(gdr2))
gdr2 = gdr2[clean]
print(len(gdr2))
print(gdr2.dtype.names)


3134770
2925796
('solution_id', 'designation', 'source_id', 'random_index', 'ref_epoch', 'ra', 'ra_error', 'dec', 'dec_error', 'parallax', 'parallax_error', 'parallax_over_error', 'pmra', 'pmra_error', 'pmdec', 'pmdec_error', 'ra_dec_corr', 'ra_parallax_corr', 'ra_pmra_corr', 'ra_pmdec_corr', 'dec_parallax_corr', 'dec_pmra_corr', 'dec_pmdec_corr', 'parallax_pmra_corr', 'parallax_pmdec_corr', 'pmra_pmdec_corr', 'astrometric_n_obs_al', 'astrometric_n_obs_ac', 'astrometric_n_good_obs_al', 'astrometric_n_bad_obs_al', 'astrometric_gof_al', 'astrometric_chi2_al', 'astrometric_excess_noise', 'astrometric_excess_noise_sig', 'astrometric_params_solved', 'astrometric_primary_flag', 'astrometric_weight_al', 'astrometric_pseudo_colour', 'astrometric_pseudo_colour_error', 'mean_varpi_factor_al', 'astrometric_matched_observations', 'visibility_periods_used', 'astrometric_sigma5d_max', 'frame_rotator_object_type', 'matched_observations', 'duplicated_source', 'phot_g_n_obs', 'phot_g_mean_flux', 'phot_g_mean_flux_error', 'phot_g_mean_flux_over_error', 'phot_g_mean_mag', 'phot_bp_n_obs', 'phot_bp_mean_flux', 'phot_bp_mean_flux_error', 'phot_bp_mean_flux_over_error', 'phot_bp_mean_mag', 'phot_rp_n_obs', 'phot_rp_mean_flux', 'phot_rp_mean_flux_error', 'phot_rp_mean_flux_over_error', 'phot_rp_mean_mag', 'phot_bp_rp_excess_factor', 'phot_proc_mode', 'bp_rp', 'bp_g', 'g_rp', 'radial_velocity', 'radial_velocity_error', 'rv_nb_transits', 'rv_template_teff', 'rv_template_logg', 'rv_template_fe_h', 'phot_variable_flag', 'l', 'b', 'ecl_lon', 'ecl_lat', 'priam_flags', 'teff_val', 'teff_percentile_lower', 'teff_percentile_upper', 'a_g_val', 'a_g_percentile_lower', 'a_g_percentile_upper', 'e_bp_min_rp_val', 'e_bp_min_rp_percentile_lower', 'e_bp_min_rp_percentile_upper', 'flame_flags', 'radius_val', 'radius_percentile_lower', 'radius_percentile_upper', 'lum_val', 'lum_percentile_lower', 'lum_percentile_upper', 'datalink_url', 'epoch_photometry_url')

In [4]:
def gmagerror(flux,fluxerror):
    """
    calculates the symmetric gmag error from fluxes, only good approximation for low values
    """
    def flux2mag(f):
        return(-2.5*np.log10(f)+25.688365)
    gp = flux2mag(flux + fluxerror)
    gm = flux2mag(flux - fluxerror)
    return(np.divide(gm-gp,2))

In [5]:
# training vpu and gnobs on l and b
g = gdr2.phot_g_mean_mag
bprp = gdr2.phot_bp_mean_mag - gdr2.phot_rp_mean_mag
l = gdr2.l#gdr2.ecl_lon#gdr2.l
b = gdr2.b#gdr2.ecl_lat#np.abs(np.sin(np.divide(gdr2.ecl_lat,np.pi/180.)))#gdr2.b
pe = gdr2.parallax_error
vp = gdr2.visibility_periods_used
gn = gdr2.phot_g_n_obs
f = gdr2.phot_g_mean_flux
fe = gdr2.phot_g_mean_flux_error
ge = gmagerror(f,fe)
#rve = gdr2.radial_velocity_error
X = np.vstack((l,b)).T
y = np.vstack((vp,gn)).T
model = ExtraTreesRegressor(n_estimators=10, criterion='mse', max_depth=None,
                            min_samples_split=5, min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, max_features='auto', 
                            max_leaf_nodes=None, min_impurity_decrease=0.0, 
                            min_impurity_split=None, bootstrap=True, oob_score=True,
                            n_jobs=1, random_state=None, verbose=0, warm_start=False)
model.fit(X,y)

Xval = np.vstack((gdr2val.l,gdr2val.b)).T
pe = gdr2val.parallax_error
vp = gdr2val.visibility_periods_used
gn = gdr2val.phot_g_n_obs
ge = gdr2val.phot_g_mean_flux_error

y_val = np.vstack((vp,gn)).T
y_pred = model.predict(Xval)
print(model.feature_importances_)

names = ['visibility_periods_used','g_n_obs']
for i,item in enumerate(names):
    print(i,item)
    plt.plot(y_val[:,i],y_pred[:,i], ',', alpha = 0.1)
    plt.xlabel(item + '_real')
    plt.ylabel(item + '_predicted')
    plt.yscale("log")
    plt.xscale("log")
    plt.show()
    plt.close()
filename = "lb2vpunobs_model"
pickle.dump(model,open(filename,'wb'))


/home/rybizki/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py:724: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
[0.56669839 0.43330161]
0 visibility_periods_used
1 g_n_obs

In [6]:
vpu_val = y_pred[:,0]
nobs_val = y_pred[:,1]

# training parallax_error and gmagnitude error on vpu, gnobs, g, bp-rp
g = gdr2.phot_g_mean_mag
bprp = gdr2.phot_bp_mean_mag - gdr2.phot_rp_mean_mag
l = gdr2.ecl_lon#gdr2.l
b = gdr2.ecl_lat#np.abs(np.sin(np.divide(gdr2.ecl_lat,np.pi/180.)))#gdr2.b
pe = gdr2.parallax_error
vp = gdr2.visibility_periods_used
gn = gdr2.phot_g_n_obs
f = gdr2.phot_g_mean_flux
fe = gdr2.phot_g_mean_flux_error
ge = gmagerror(f,fe)
#rve = gdr2.radial_velocity_error
X = np.vstack((g,bprp,vp,gn)).T
y = np.vstack((pe,ge)).T
model = ExtraTreesRegressor(n_estimators=10, criterion='mse', max_depth=None,
                            min_samples_split=5, min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, max_features='auto', 
                            max_leaf_nodes=None, min_impurity_decrease=0.0, 
                            min_impurity_split=None, bootstrap=True, oob_score=True,
                            n_jobs=1, random_state=None, verbose=0, warm_start=False)
model.fit(X,y)

Xval = np.vstack((gdr2val.phot_g_mean_mag,gdr2val.phot_bp_mean_mag-gdr2val.phot_rp_mean_mag,
                 vpu_val,nobs_val)).T
pe = gdr2val.parallax_error
vp = gdr2val.visibility_periods_used
gn = gdr2val.phot_g_n_obs
ge = gmagerror(gdr2val.phot_g_mean_flux,gdr2val.phot_g_mean_flux_error)

y_val = np.vstack((pe,ge)).T
y_pred = model.predict(Xval)
print(model.feature_importances_)

names = ['parallax_error','g_mag_error']
for i,item in enumerate(names):
    print(i,item)
    plt.plot(y_val[:,i],y_pred[:,i], ',', alpha = 0.1)
    plt.xlabel(item + '_real')
    plt.ylabel(item + '_predicted')
    plt.yscale("log")
    plt.xscale("log")
    plt.show()
    plt.close()
filename = "gbprpvpunobs2pege_model"
pickle.dump(model,open(filename,'wb'))


/home/rybizki/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py:724: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
[0.78760275 0.03202318 0.14678199 0.03359208]
0 parallax_error
1 g_mag_error

In [7]:
# radial_velocity_error on g, bp-rp, teff
cut = np.isnan(gdr2.radial_velocity)
print(len(gdr2))
gdr2 = gdr2[~cut]
print(len(gdr2))
cut = np.isnan(gdr2.teff_val)
print(len(gdr2))
gdr2 = gdr2[~cut]
print(len(gdr2))
g = gdr2.phot_g_mean_mag
bprp = gdr2.phot_bp_mean_mag - gdr2.phot_rp_mean_mag
te = gdr2.teff_val
rve = gdr2.radial_velocity_error
#rve = gdr2.radial_velocity_error
X = np.vstack((g,bprp,te)).T
y = rve
model = ExtraTreesRegressor(n_estimators=10, criterion='mse', max_depth=None,
                            min_samples_split=5, min_samples_leaf=1, 
                            min_weight_fraction_leaf=0.0, max_features='auto', 
                            max_leaf_nodes=None, min_impurity_decrease=0.0, 
                            min_impurity_split=None, bootstrap=True, oob_score=True,
                            n_jobs=1, random_state=None, verbose=0, warm_start=False)
model.fit(X,y)
cut = np.isnan(gdr2val.radial_velocity)
print(len(gdr2val))
gdr2val = gdr2val[~cut]
print(len(gdr2val))
cut = np.isnan(gdr2val.teff_val)
print(len(gdr2val))
gdr2val = gdr2val[~cut]
print(len(gdr2val))
Xval = np.vstack((gdr2val.phot_g_mean_mag,gdr2val.phot_bp_mean_mag-gdr2val.phot_rp_mean_mag,
                 gdr2val.teff_val)).T
rve = gdr2val.radial_velocity_error
y_val = rve
y_pred = model.predict(Xval)
print(model.feature_importances_)

plt.plot(y_val,y_pred, ',', alpha = 0.1)
plt.xlabel('rve_real')
plt.ylabel('rve_predicted')
plt.yscale("log")
plt.xscale("log")
plt.show()
plt.close()
filename = "gbprpteff2rvse_model"
pickle.dump(model,open(filename,'wb'))


2925796
18046
18046
18046
/home/rybizki/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py:724: UserWarning: Some inputs do not have OOB scores. This probably means too few trees were used to compute any reliable oob estimates.
  warn("Some inputs do not have OOB scores. "
2927847
18085
18085
18085
[0.44214761 0.27722553 0.28062686]

In [8]:
g = gdr2.phot_bp_mean_mag
f = gdr2.phot_bp_mean_flux
fe = gdr2.phot_bp_mean_flux_error
t = -2.5*np.log10(f)
print(t)
print(g)
print(t-g)
def flux2mag(flux):
    return(-2.5*np.log10(flux)+25.351388)
gp = flux2mag(f + fe)
gm = flux2mag(f - fe)
d1 = gp-g
d2 = g-gm
# assymetry in magnitude error
plt.plot(d1,d2,'.', alpha = 0.1)
plt.plot([-0.14,0],[-0.14,0])
plt.xlim((-0.03,0.0))
plt.ylim((-0.03,0.0))
bpme = np.divide(gm-gp,2)


[-12.58035663 -11.62211968 -12.56008775 ... -12.03549994 -11.6300478
 -10.82810035]
[12.771031 13.729268 12.791301 ... 13.315888 13.72134  14.523288]
[-25.35138801 -25.35138776 -25.35138853 ... -25.35138835 -25.35138798
 -25.35138812]

In [9]:
g = gdr2.phot_rp_mean_mag
f = gdr2.phot_rp_mean_flux
fe = gdr2.phot_rp_mean_flux_error
t = -2.5*np.log10(f)
print(t)
print(g)
print(t-g)
def flux2mag(flux):
    return(-2.5*np.log10(flux)+24.7619)
gp = flux2mag(f + fe)
gm = flux2mag(f - fe)
d1 = gp-g
d2 = g-gm
# assymetry in magnitude error
plt.plot(d1,d2,'.', alpha = 0.1)
plt.plot([-0.14,0],[-0.14,0])
plt.xlim((-0.03,0.0))
plt.ylim((-0.03,0.0))
rpme = np.divide(gm-gp,2)


[-13.17764883 -12.69350459 -13.36425374 ... -13.31469947 -12.60579287
 -12.13491794]
[11.584271 12.068416 11.397666 ... 11.447221 12.156127 12.627002]
[-24.76192026 -24.76192023 -24.76191972 ... -24.76192027 -24.76191984
 -24.7619197 ]

In [10]:
plt.plot(rpme,g,',', alpha = 0.1)
plt.yscale('log')
plt.xscale('log')



In [11]:
nobs = np.genfromtxt('errors/nobs.txt', names = True)
scaling_factor_dr2 = 0.37
number_obs = np.round(scaling_factor_dr2*np.interp(np.abs(np.sin(gdr2.ecl_lat)),nobs['sinbeta'],nobs['N_obs']))
plt.plot(number_obs,gdr2.phot_g_n_obs,'.',alpha = 0.01)
plt.plot([6,22],[6,22],)
plt.xlabel('_real')
plt.ylabel('_predicted')
plt.yscale("log")
plt.xscale("log")
plt.show()
plt.close()